library(reshape2) # melt function
library(ggplot2) # ggplot function
library(pcaPP) # Fast Kendall function
library(energy) # Distance Correlation
library(Hmisc) # Hoeffding's D measure
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(entropy) # Mutual Information
library(minerva) # Maximum Information Coefficient
library(XICOR) # Chatterjee's Coefficient
library(dHSIC) # Hilbert Schmidt Independence Criterion
library(VineCopula) # Blomqvist's Beta

source("data_generation.R")
my_beta <- function(x, y){
    # following Blomqvist, N. (1950).  On a measure of dependence between two
    # random variables. The Annals of Mathematical Statistics, 21(4), 593-600.
    mx <- median(x); my <- median(y)
    n1 <- sum(x < mx & y > my) +  sum(x > mx & y < my)
    n2 <- sum(x < mx & y < my) +  sum(x > mx & y > my)
    
    return ((n1 - n2)/(n1 + n2))
 
}

my_XICOR <- function(x, y){
  return(max(XICOR::calculateXI(x,y), XICOR::calculateXI(y,x)))
}
generate_quadratic <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -1, max = 1)
    y <- x^2
    c(x,y)
  }))
}

generate_quadratic2 <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -1, max = 1)
    y <- x^2 + stats::rnorm(1, mean = 0, sd = 0.3)
    c(x,y)
  }))
}

gen_vertical1 <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -2, max = 2)
    y <- 0 
    c(x,y)
  }))
}

gen_vertical2 <- function(n){
  t(sapply(1:n, function(i){
    x <- 0
    y <- stats::runif(1, min = -2, max = 2)
    c(x,y)
  }))
}

generate_power <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = 0, max = 2)
    y <- x^10 + stats::rnorm(1, mean = 10, sd = 10)
    c(x,y)
  }))
}

gen_linear <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    x <- stats::rnorm(1, mean = 2.5, sd = 1)
    y <- x + stats::rnorm(1, mean = 0, sd = 1)
    c(x,y)
  }))
}

gen_strong_linear <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    x <- stats::rnorm(1, mean = 2.5, sd = 1)
    y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    c(x,y)
  }))
}

gen_sine <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -2, max = 12)
    y <- sin(x) + stats::rnorm(1, mean = 0, sd = 0.1)
    c(x,y)
  }))
}
quadratic_1 <- generate_quadratic(100)
plot(quadratic_1)

exper_n <- 500

ver1 <- gen_vertical1(exper_n)
plot(ver1)

ver2 <- gen_vertical2(exper_n)
plot(ver2)

cross <- rbind(ver1, ver2)
plot(cross)

power <- generate_power(exper_n)
plot(power)

quadratic2 <- generate_quadratic2(400)
plot(quadratic2)

sine <- gen_sine(exper_n)
plot(sine)

Dcor vs Pearson

plot(quadratic_1)

energy::dcor(quadratic_1[,1],quadratic_1[,2])
## [1] 0.5102642
stats::cor(quadratic_1, method = "pearson")
##            [,1]       [,2]
## [1,]  1.0000000 -0.2608609
## [2,] -0.2608609  1.0000000
#stats::cor(quadratic_1, method = "spearman")

Things to consider

lollipop <- .generate_lollipop(500)
plot(lollipop)

stats::cor(lollipop)
##           [,1]      [,2]
## [1,] 1.0000000 0.8452204
## [2,] 0.8452204 1.0000000
energy::dcor(lollipop[,1],lollipop[,2])
## [1] 0.8798029

XI vs Spearman

Linear

lin <- gen_linear(200)
plot(lin)

my_XICOR(lin[,1], lin[,2])
## [1] 0.2872572
stats::cor(lin, method = "spearman")
##           [,1]      [,2]
## [1,] 1.0000000 0.7313043
## [2,] 0.7313043 1.0000000

Quadratic

my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941
stats::cor(quadratic_1, method = "spearman")
##            [,1]       [,2]
## [1,]  1.0000000 -0.2044164
## [2,] -0.2044164  1.0000000

Sine

my_XICOR(sine[,1], sine[,2])
## [1] 0.8224473
stats::cor(sine, method = "spearman")
##             [,1]        [,2]
## [1,]  1.00000000 -0.06390755
## [2,] -0.06390755  1.00000000

Hoeffding’s D vs XI

quadratic

Hmisc::hoeffd(sine[,1], sine[,2])$D
##            x          y
## x 1.00000000 0.05549161
## y 0.05549161 1.00000000
my_XICOR(sine[,1], sine[,2])
## [1] 0.8224473
plot(quadratic_1)

Hmisc::hoeffd(quadratic_1[,1], quadratic_1[,2])$D
##           x         y
## x 1.0000000 0.2523113
## y 0.2523113 1.0000000
my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941

Quadratic with noise

plot(quadratic2)

Hmisc::hoeffd(quadratic2[,1], quadratic2[,2])$D
##            x          y
## x 1.00000000 0.04664275
## y 0.04664275 1.00000000
my_XICOR(quadratic2[,1], quadratic2[,2])
## [1] 0.3069769

Beta vs Spearman

steep exponential

plot(power)

stats::cor(power, method = "spearman")
##           [,1]      [,2]
## [1,] 1.0000000 0.7491827
## [2,] 0.7491827 1.0000000
my_beta(power[,1], power[,2])
## [1] -0.592
gen_quadrant <- function(n){
  t(sapply(1:n, function(i){
    rand_int <- sample(1:4, 1)
    if (rand_int == 1){
      x <- stats::runif(1, min = 0, max = 3)
      y <- stats::runif(1, min = 0, max = 3)
      
    } else if (rand_int == 2){
      x <- stats::runif(1, min = 0, max = 4)
      y <- stats::runif(1, min = 10, max = 14)
      
    } else if (rand_int == 3){
      x <- stats::runif(1, min = 10, max = 14)
      y <- stats::runif(1, min = 0, max = 4)
      
    } else {
      x <- stats::runif(1, min = 10, max = 14)
      y <- stats::runif(1, min = 10, max = 14)
    }
    c(x,y)
  }))
}

quadrant <- gen_quadrant(exper_n)

mean(quadrant[, 1])
## [1] 6.540044
mean(quadrant[, 2])
## [1] 6.86236
plot(quadrant)

Beta vs MI

my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
mi.empirical(entropy::discretize2d(as.matrix(quadrant[, 1]), as.matrix(quadrant[, 2]), numBins1 = 20, numBins2 = 20))
## Warning in KL.plugin(freqs2d, freqs.null, unit = unit): Vanishing value(s) in
## argument freqs2!
## [1] 0.1956087

Beta vs Kendall

plot(quadrant)

my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
stats::cor(quadrant, method = "kendall")
##            [,1]       [,2]
## [1,] 1.00000000 0.07547896
## [2,] 0.07547896 1.00000000
#minerva::mine(quadrant)$MIC
plot(lin)

my_beta(lin[, 1], lin[, 2])
## [1] -0.56
stats::cor(lin, method = "kendall")
##           [,1]      [,2]
## [1,] 1.0000000 0.5319598
## [2,] 0.5319598 1.0000000
#minerva::mine(quadrant)$MIC

MIC vs MI

plot(quadratic_1)

minerva::mine(quadratic_1)$MIC
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
mi.empirical(entropy::discretize2d(as.matrix(quadratic_1[, 1]), as.matrix(quadratic_1[, 2]), numBins1 = 10, numBins2 = 10))
## [1] 1.306421
#minerva::mine(quadrant)$MIC

2. Explore more about the lollipop

get_measures <- function(dat){
  pearson_cor <- stats::cor(dat, method = "pearson")[1,2]
  spearman_cor <- stats::cor(dat, method = "spearman")[1,2]
  kendall_cor <- pcaPP::cor.fk(dat)[1,2]
  dist_cor <- energy::dcor(dat[,1], dat[,2])
  hoeff_cor <- Hmisc::hoeffd(dat[,1], dat[,2])$D[1,2]
  
  MI_cor <- entropy::mi.empirical(entropy::discretize2d(as.matrix(dat[, 1]),
                                                           as.matrix(dat[, 2]),
                                                           numBins1 = 10, numBins2 = 10))
  MIC_cor <- minerva::mine(dat)$MIC[1,2]
  XICOR_cor <- my_XICOR(dat[,1], dat[,2])
  HSIC_cor <- dHSIC::dhsic(dat[,1], dat[, 2])$dHSIC
  beta_cor <- my_beta(dat[, 1], dat[, 2])
  return(c(pearson_cor, spearman_cor, kendall_cor, dist_cor, hoeff_cor,
           MI_cor, MIC_cor, XICOR_cor, HSIC_cor, beta_cor))
}

2-1. Original lollipop

original <- .generate_lollipop
set.seed(11)
lollipop1 <- original(500)
plot(lollipop1)

lolli1_measure <- get_measures(lollipop1)

2-2. Three clusters in total

new_lollipop2 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:3, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    
      # generate "bridge"
    } else if (k == 2){
      x <- stats::rnorm(1, mean = 1, sd = 0.5)
      y <- stats::rnorm(1, mean = 2, sd = 0.5)
      
    } else {
      # k == 3, generate "stick"
      x <- stats::rnorm(1, mean = 3, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
    } 
    
    c(x,y)
  }))
}
set.seed(10)
new_pop2 <- new_lollipop2(500)
plot(new_pop2)

lolli2_measure <- get_measures(new_pop2)

2-3. Two clusters that continously connect the circle and stick

set.seed(10)
new_lollipop3 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:2, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
    }
        
    c(x,y)
  }))
}
new_pop3 <- new_lollipop3(500)
plot(new_pop3)

stats::cor(new_pop3, method = "pearson")[1,2]
## [1] 0.6320267
energy::dcor(new_pop3[,1],new_pop3[,2])
## [1] 0.6309555
lolli3_measure <- get_measures(new_pop3)

2-4.The lollipop that has a very long stick

set.seed(15)
new_lollipop4 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:2, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
      
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 6, sd = 4)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
    }
        
    c(x,y)
  }))
}
new_pop4 <- new_lollipop4(500)
plot(new_pop4)

lolli4_measure <- get_measures(new_pop4)

2-5. The lollipop that has less longer stick

set.seed(230)
new_lollipop5 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 0.5)
      y <- stats::rnorm(1, mean = 1, sd = 0.5)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 1, sd = 1)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop5 <- new_lollipop5(500)
plot(new_pop5)

lolli5_measure <- get_measures(new_pop5)

2-6. The lollipop that has less dense stick in the circle

set.seed(230)
new_lollipop6 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop6 <- new_lollipop6(500)
plot(new_pop6)

lolli6_measure <- get_measures(new_pop6)

2-7. The lollipop that has very dense stick in the circle

set.seed(230)
new_lollipop7 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 2)
      y <- stats::rnorm(1, mean = 0, sd = 2)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 0, sd = 0.5)
      y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop7 <- new_lollipop7(600)
plot(new_pop7)

lolli7_measure <- get_measures(new_pop7)
## Warning in KL.plugin(freqs2d, freqs.null, unit = unit): Vanishing value(s) in
## argument freqs2!

2-8. The lollipop that has multiple clusters in the ball

set.seed(230)
new_lollipop8 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:5, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else if (k == 2) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 3) {
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 4) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else {
      # Otherwise, generate "stick"
      x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
    }
    
    c(x,y)
  }))
}
new_pop8 <- new_lollipop8(700)
plot(new_pop8)

lolli8_measure <- get_measures(new_pop8)
lolli8_measure
##  [1]  0.53934296  0.36347619  0.25379113  0.50476354  0.05230268  0.32225962
##  [7]  0.34178603  0.21055757  0.01286012 -0.20571429

2-9. The lollipop that has multiple clusters in the ball and more weights on stick than previous one

set.seed(230)
new_lollipop9 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:6, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else if (k == 2) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 3) {
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 4) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else {
      # Otherwise, generate "stick"
      x <- stats::rnorm(1, mean = 1, sd = 1)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop9 <- new_lollipop9(800)
plot(new_pop9)

lolli9_measure <- get_measures(new_pop9)
combined_measure <- rbind(lolli1_measure, lolli2_measure, lolli3_measure,
                          lolli4_measure, lolli5_measure, lolli6_measure,
                          lolli7_measure, lolli8_measure, lolli9_measure)

rownames(combined_measure) <- 1:9
colnames(combined_measure) <- c("pearson", "spearman", "kendall", "dist_cor", "hoeff_D",
                                "MI", "MIC", "XICOR", "HSIC", "Blomq_beta")

combined_measure
##      pearson  spearman   kendall  dist_cor    hoeff_D        MI       MIC
## 1 0.85360836 0.8496716 0.6607615 0.8884833 0.40401811 0.7799179 0.9153044
## 2 0.82197254 0.8256502 0.6452745 0.8336293 0.35008693 0.7097939 0.7308506
## 3 0.63202671 0.6461776 0.4778998 0.6309555 0.18896672 0.4371723 0.4373949
## 4 0.96865997 0.8447599 0.6949579 0.9701998 0.45858001 1.0792695 0.9253863
## 5 0.82823173 0.7293480 0.5559118 0.8087890 0.26510779 0.6273113 0.6310743
## 6 0.44854941 0.4548433 0.3461162 0.4538884 0.09886414 0.3112321 0.3008466
## 7 0.03037599 0.2145965 0.2000779 0.2653970 0.06169904 0.1751000 0.3223880
## 8 0.53934296 0.3634762 0.2537911 0.5047635 0.05230268 0.3222596 0.3417860
## 9 0.65040054 0.5171377 0.3823467 0.6132621 0.12358580 0.3871556 0.4311486
##       XICOR       HSIC Blomq_beta
## 1 0.6018384 0.09836303 -0.8880000
## 2 0.5233341 0.05951117 -0.5840000
## 3 0.3166453 0.03271560 -0.5280000
## 4 0.6392066 0.09954329 -0.7360000
## 5 0.4473378 0.04862815 -0.5520000
## 6 0.1725967 0.01988712 -0.4320000
## 7 0.1593810 0.01630651 -0.3333333
## 8 0.2105576 0.01286012 -0.2057143
## 9 0.3001927 0.02390519 -0.3200000
melt(combined_measure)
##    Var1       Var2       value
## 1     1    pearson  0.85360836
## 2     2    pearson  0.82197254
## 3     3    pearson  0.63202671
## 4     4    pearson  0.96865997
## 5     5    pearson  0.82823173
## 6     6    pearson  0.44854941
## 7     7    pearson  0.03037599
## 8     8    pearson  0.53934296
## 9     9    pearson  0.65040054
## 10    1   spearman  0.84967156
## 11    2   spearman  0.82565025
## 12    3   spearman  0.64617762
## 13    4   spearman  0.84475989
## 14    5   spearman  0.72934804
## 15    6   spearman  0.45484329
## 16    7   spearman  0.21459654
## 17    8   spearman  0.36347619
## 18    9   spearman  0.51713769
## 19    1    kendall  0.66076152
## 20    2    kendall  0.64527455
## 21    3    kendall  0.47789980
## 22    4    kendall  0.69495792
## 23    5    kendall  0.55591182
## 24    6    kendall  0.34611623
## 25    7    kendall  0.20007791
## 26    8    kendall  0.25379113
## 27    9    kendall  0.38234668
## 28    1   dist_cor  0.88848326
## 29    2   dist_cor  0.83362930
## 30    3   dist_cor  0.63095547
## 31    4   dist_cor  0.97019977
## 32    5   dist_cor  0.80878897
## 33    6   dist_cor  0.45388839
## 34    7   dist_cor  0.26539701
## 35    8   dist_cor  0.50476354
## 36    9   dist_cor  0.61326212
## 37    1    hoeff_D  0.40401811
## 38    2    hoeff_D  0.35008693
## 39    3    hoeff_D  0.18896672
## 40    4    hoeff_D  0.45858001
## 41    5    hoeff_D  0.26510779
## 42    6    hoeff_D  0.09886414
## 43    7    hoeff_D  0.06169904
## 44    8    hoeff_D  0.05230268
## 45    9    hoeff_D  0.12358580
## 46    1         MI  0.77991786
## 47    2         MI  0.70979386
## 48    3         MI  0.43717231
## 49    4         MI  1.07926952
## 50    5         MI  0.62731131
## 51    6         MI  0.31123211
## 52    7         MI  0.17510004
## 53    8         MI  0.32225962
## 54    9         MI  0.38715557
## 55    1        MIC  0.91530440
## 56    2        MIC  0.73085065
## 57    3        MIC  0.43739491
## 58    4        MIC  0.92538633
## 59    5        MIC  0.63107431
## 60    6        MIC  0.30084656
## 61    7        MIC  0.32238804
## 62    8        MIC  0.34178603
## 63    9        MIC  0.43114856
## 64    1      XICOR  0.60183841
## 65    2      XICOR  0.52333409
## 66    3      XICOR  0.31664527
## 67    4      XICOR  0.63920656
## 68    5      XICOR  0.44733779
## 69    6      XICOR  0.17259669
## 70    7      XICOR  0.15938100
## 71    8      XICOR  0.21055757
## 72    9      XICOR  0.30019266
## 73    1       HSIC  0.09836303
## 74    2       HSIC  0.05951117
## 75    3       HSIC  0.03271560
## 76    4       HSIC  0.09954329
## 77    5       HSIC  0.04862815
## 78    6       HSIC  0.01988712
## 79    7       HSIC  0.01630651
## 80    8       HSIC  0.01286012
## 81    9       HSIC  0.02390519
## 82    1 Blomq_beta -0.88800000
## 83    2 Blomq_beta -0.58400000
## 84    3 Blomq_beta -0.52800000
## 85    4 Blomq_beta -0.73600000
## 86    5 Blomq_beta -0.55200000
## 87    6 Blomq_beta -0.43200000
## 88    7 Blomq_beta -0.33333333
## 89    8 Blomq_beta -0.20571429
## 90    9 Blomq_beta -0.32000000